Language Model Dataset

Note

We use H. G. Wells's novel "The Time Machine" as the corpus for training our language model. It is quite small, with only a little over 30,000 words.
Turning raw text into a dataset a model can use generally takes the following steps:

  1. Read the data

  2. Tokenize

  3. Build the vocabulary

  4. Convert tokens to numeric indices

  5. Create the dataset

Reading the Data

import re
import random
import collections
import torch


#@save
def read_time_machine():
    # Read "The Time Machine" by H. G. Wells
    with open("../data/timemachine.txt") as f:
        lines = f.readlines()
    # Replace non-alphabetic characters with spaces and lowercase everything
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
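
As a quick sanity check, we can count the lines and print the first one (this assumes ../data/timemachine.txt is in place):

lines = read_time_machine()
print(f'number of lines: {len(lines)}')
print(lines[0])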

Tokenize

A token is the basic unit of text; it can be a character or a word.

#@save
def tokenize(lines, token_type='char'):
    # Split each line into a list of words or a list of characters
    if token_type == 'word':
        return [line.split() for line in lines]
    elif token_type == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token_type)
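
For example, on a single toy line (illustrative input, not read from the corpus):

demo = ['the time machine']
print(tokenize(demo, token_type='word'))
# [['the', 'time', 'machine']]
print(tokenize(demo, token_type='char'))
# [['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm', 'a', 'c', 'h', 'i', 'n', 'e']]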

Building the Vocabulary

#@save
class Vocab:
    """tokens的词汇表"""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        # Special reserved tokens such as <pad>
        if reserved_tokens is None:
            reserved_tokens = []
        # Count the occurrences of each token
        counter = collections.Counter([token for line in tokens for token in line])
        # Sort tokens by frequency, most frequent first
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], 
                                  reverse=True)
        # The index for the unknown token is 0
        self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
        # Keep only tokens that appear at least min_freq times
        uniq_tokens += [token for token, freq in self.token_freqs
                        if freq >= min_freq and token not in uniq_tokens]
        # Mappings between indices and tokens
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """tokens转化成索引"""
        if not isinstance(tokens, (list, tuple)):
            # A single token can be looked up directly
            return self.token_to_idx.get(tokens, self.unk)
        # A list of tokens is converted recursively
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """索引转化成tokens"""
        if not isinstance(indices, (list, tuple)):
            # An out-of-range index raises an IndexError
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
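
With tokenize and Vocab in hand, we can build a word-level vocabulary and round-trip a few tokens (a sketch; it assumes the corpus file is available):

vocab = Vocab(tokenize(read_time_machine(), token_type='word'))
# The most frequent tokens come first in token_freqs
print(vocab.token_freqs[:3])
indices = vocab[['the', 'time', 'machine']]
print(indices)
print(vocab.to_tokens(indices))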

Converting Tokens to Numeric Indices

Chain the previous steps together.

#@save
def load_corpus_time_machine():
    """获得timemachine语料库与词汇表"""
    # tokenize
    tokens = tokenize(read_time_machine())
    # Build the vocabulary
    vocab = Vocab(tokens)
    # Flatten into a List[int], dropping tokens that map to <unk> (index 0)
    corpus = [vocab[token] for line in tokens for token in line if vocab[token] != 0]
    return corpus, vocab
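
A quick check on the result: the corpus length and the vocabulary size (character-level tokenization keeps the vocabulary small):

corpus, vocab = load_corpus_time_machine()
print(len(corpus), len(vocab))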

Creating the Dataset

Implement an iterable that reads the corpus and yields minibatches.

#@save
class TimeMachineDataLoader:
    """生成timemachine数据集"""
    def __init__(self, batch_size, num_steps):
        # Load the corpus and vocabulary from the previous step
        self.corpus, self.vocab = load_corpus_time_machine()
        # batch_size: the number of sequences per minibatch
        # num_steps: the number of tokens (indices) per sequence
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        # Start from a random offset so each epoch sees a different partitioning
        offset = random.randint(0, self.num_steps - 1)
        num_tokens = ((len(self.corpus) - offset - 1) // self.batch_size) * self.batch_size
        # shape: (batch_size, -1)
        # The targets Ys are shifted forward by 1: we predict the next token
        Xs = torch.tensor(self.corpus[offset: offset + num_tokens]
                         ).reshape(self.batch_size, -1)
        Ys = torch.tensor(self.corpus[offset + 1: offset + 1 + num_tokens]
                         ).reshape(self.batch_size, -1)
        # Number of minibatches
        num_batches = Xs.shape[1] // self.num_steps
        for i in range(0, self.num_steps * num_batches, self.num_steps):
            # Slice out the columns for this minibatch
            X = Xs[:, i: i + self.num_steps]
            Y = Ys[:, i: i + self.num_steps]
            yield X, Y
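
To make the X/Y alignment easy to inspect without reading the corpus file, the following sketch applies the same partitioning logic to a toy corpus of consecutive integers; the helper demo_seq_iter is hypothetical, introduced here only for illustration:

def demo_seq_iter(corpus, batch_size, num_steps):
    # Same partitioning as TimeMachineDataLoader.__iter__, minus the file I/O
    offset = random.randint(0, num_steps - 1)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = torch.tensor(corpus[offset: offset + num_tokens]).reshape(batch_size, -1)
    Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens]).reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        yield Xs[:, i: i + num_steps], Ys[:, i: i + num_steps]

for X, Y in demo_seq_iter(list(range(20)), batch_size=2, num_steps=4):
    print('X:', X)
    print('Y:', Y)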

Putting It All Together

#@save
def load_data_time_machine(batch_size, num_steps):
    """读取timemachine数据集和词汇表"""
    data_iter = TimeMachineDataLoader(batch_size, num_steps)
    return data_iter, data_iter.vocab

# Each minibatch has shape (batch_size, num_steps)
data_iter, vocab = load_data_time_machine(2, 5)
for x, y in data_iter:
    print(x)
    print(y)
    break
tensor([[ 2,  1,  3,  5, 13],
        [ 9,  4,  3,  1,  3]])
tensor([[ 1,  3,  5, 13,  2],
        [ 4,  3,  1,  3,  9]])
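
Note how each row of y is the corresponding row of x shifted one position to the left: at every time step, the label for a token is simply the token that follows it in the corpus.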